# Create optimized analysis dataset
analysis_data <- merged_data %>%
# Filter for complete data
filter(!is.na(pce), !is.na(prevent_base_ascvd_risk)) %>%
filter(!is.na(CAD), !is.na(CVD)) %>% # Require PRS
filter(!is.na(f.22009.0.1), !is.na(f.22009.0.2), !is.na(f.22009.0.3)) %>%
filter(time_accel_to_mi > 0, time_accel_to_af > 0,
time_accel_to_hf > 0, time_accel_to_stroke > 0) %>%
mutate(
# Time variables (already in years)
time_to_mi_years = time_accel_to_mi,
time_to_af_years = time_accel_to_af,
time_to_hf_years = time_accel_to_hf,
time_to_stroke_years = time_accel_to_stroke,
# Standardize accelerometer metrics
mvpa_z = scale(mvpa_daily_total)[,1],
sed_z = scale(sed_daily_total)[,1],
# Standardize clinical risk scores
pce_z = scale(pce)[,1],
prevent_z = scale(prevent_base_ascvd_risk)[,1],
risk_combined = (scale(pce)[,1] + scale(prevent_base_ascvd_risk)[,1]) / sqrt(2),
# Standardize PRS (already z-scores, but re-standardize in this cohort)
prs_cad_z = scale(CAD)[,1],
prs_cvd_z = scale(CVD)[,1],
prs_af_z = scale(AF)[,1],
prs_ht_z = scale(HT)[,1], # Hypertension PRS (can use for HF models)
prs_stroke_z = scale(stroke)[,1],
prs_t2d_z = scale(T2D)[,1],
# Combined genetic risk for CVD outcomes
prs_cvd_combined = (scale(CAD)[,1] + scale(CVD)[,1]) / sqrt(2),
# Interaction terms: Activity × Risk
mvpa_x_pce = mvpa_z * pce_z,
mvpa_x_prevent = mvpa_z * prevent_z,
mvpa_x_prs_cad_z = mvpa_z * prs_cad_z,
mvpa_x_prs_cvd_z = mvpa_z * prs_cvd_z,
mvpa_x_prs_af_z = mvpa_z * prs_af_z,
# Combined risk interaction
mvpa_x_risk_comb = mvpa_z * risk_combined,
mvpa_x_prs_comb = mvpa_z * prs_cvd_combined,
# Covariates (standardized)
age_z = scale(age_accel)[,1],
bmi_z = scale(bmi)[,1],
pc1 = scale(f.22009.0.1)[,1],
pc2 = scale(f.22009.0.2)[,1],
pc3 = scale(f.22009.0.3)[,1],
sex = factor(sex, levels = c("Female", "Male")),
# Risk categories for visualization
prs_cad_tertile = cut(prs_cad_z,
breaks = quantile(prs_cad_z, c(0, 0.33, 0.67, 1)),
labels = c("Low PRS", "Medium PRS", "High PRS")),
pce_tertile = cut(pce_z,
breaks = quantile(pce_z, c(0, 0.33, 0.67, 1)),
labels = c("Low PCE", "Medium PCE", "High PCE"))
)
cat("\n=== FINAL ANALYSIS DATASET ===\n")